if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, broom, skimr, devtools, ggpubr, mgcv, extrafont, mgcViz, here)
theme_set(theme_pubclean(base_size = 14))
# load the function to print GAM figures
source(here("R", "p_gam.R"))
# import
lesion <- read_csv(here("cache", "summarised_lesion_data.csv"))
weather <- read_csv(here("cache", "weather_summary.csv"))
dat <- left_join(lesion, weather, by = c("site" = "Location", "rep" = "Rep"))
For reproducibility purposes, use set.seed().
set.seed(27)
mod1 <-
gam(
mean_pot_count ~ s(distance, k = 5),
data = dat
)
summary(mod1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04751 22.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.482 Deviance explained = 48.8%
## GCV = 0.76522 Scale est. = 0.75394 n = 334
print(p_gam(x = getViz(mod1)) +
ggtitle("s(Distance)"),
pages = 1)
mod2 <-
gam(
mean_pot_count ~ sum_rain+ s(distance, k = 5),
data = dat
)
summary(mod2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.124951 0.055685 20.202 <2e-16 ***
## sum_rain -0.010853 0.007088 -1.531 0.127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.484 Deviance explained = 49.1%
## GCV = 0.76443 Scale est. = 0.75086 n = 334
print(p_gam(x = getViz(mod2)) +
ggtitle("s(Distance) + Precipitation"),
pages = 1)
mod3 <-
gam(mean_pot_count ~ mws + s(distance, k = 5),
data = dat)
summary(mod3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.64401 0.11825 5.446 1.01e-07 ***
## mws 0.12273 0.03059 4.012 7.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.929 3.996 81.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.504 Deviance explained = 51.2%
## GCV = 0.73389 Scale est. = 0.72086 n = 334
print(p_gam(x = getViz(mod3)) +
ggtitle("s(Distance) + Windspeed"),
pages = 1)
mod4 <-
gam(mean_pot_count ~ sum_rain + mws + s(distance, k = 5),
data = dat)
summary(mod4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.674729 0.118523 5.693 2.78e-08 ***
## sum_rain -0.014719 0.006968 -2.112 0.0354 *
## mws 0.131154 0.030694 4.273 2.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.93 3.996 82.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.509 Deviance explained = 51.8%
## GCV = 0.72844 Scale est. = 0.71333 n = 334
print(p_gam(x = getViz(mod4)) +
ggtitle("s(Distance) + Windspeed + Precipitation"),
pages = 1)
mod5 <-
gam(
mean_pot_count ~ sum_rain + s(distance + mws, k = 5),
data = dat
)
## Warning in term[i] <- attr(terms(reformulate(term[i])), "term.labels"): number
## of items to replace is not a multiple of replacement length
summary(mod5)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.124951 0.055685 20.202 <2e-16 ***
## sum_rain -0.010853 0.007088 -1.531 0.127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.926 3.996 78.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.484 Deviance explained = 49.1%
## GCV = 0.76443 Scale est. = 0.75086 n = 334
print(p_gam(x = getViz(mod5)) +
ggtitle("s(Distance + Windspeed) + Precipitation"),
pages = 1)
mod6 <-
gam(
mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod6)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.18789 0.07152 16.608 <2e-16 ***
## sum_rain -0.02613 0.01379 -1.895 0.059 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.938 3.997 93.81 < 2e-16 ***
## s(mws) 3.932 3.997 15.97 3.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64971 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod6)) +
ggtitle("s(Distance) + s(Windspeed) + Precipitation"),
pages = 1)
mod7 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5),
data = dat
)
summary(mod7)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04366 24.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.937 3.997 92.96 < 2e-16 ***
## s(mws) 3.917 3.995 16.04 6.03e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.562 Deviance explained = 57.3%
## GCV = 0.65392 Scale est. = 0.63659 n = 334
print(p_gam(x = getViz(mod7)) +
ggtitle("s(Distance) + s(Windspeed)"),
pages = 1)
mod8 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod8)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04345 24.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.938 3.997 93.802 <2e-16 ***
## s(mws) 2.977 3.051 6.713 0.0161 *
## s(sum_rain) 1.931 1.942 1.979 0.1899
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64966 Scale est. = 0.63051 n = 334
print(p_gam(x = getViz(mod8)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation)"),
pages = 1)
mod9 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5),
data = dat
)
summary(mod9)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08024 0.04594 23.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.931 3.996 83.82 < 2e-16 ***
## s(sum_rain) 2.030 2.198 10.76 2.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.515 Deviance explained = 52.4%
## GCV = 0.71995 Scale est. = 0.70494 n = 334
print(p_gam(x = getViz(mod9)) +
ggtitle("s(Distance) + s(Precipitation)"),
pages = 1)
mod10 <-
gam(
mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws,
data = dat
)
summary(mod10)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.8631 1.4637 -1.956 0.0513 .
## mws 1.1095 0.4116 2.695 0.0074 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.936 3.997 93.81 < 2e-16 ***
## s(sum_rain) 3.819 3.967 11.16 9.66e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 57.8%
## GCV = 0.64947 Scale est. = 0.6305 n = 334
print(p_gam(x = getViz(mod10)) +
ggtitle("s(Distance) + s(Precipitation) + Windspeed"),
pages = 1)
This is the same as mod8 but using family = tw(), see ?family.mgcv for more on the families. The Tweedie distribution is used where the distribution has a positive mass at zero, but is continuous unlike the Poisson distribution that requires count data. The data visualisation shows clearly that the mean pot count data have this shape.
mod11.0 <-
gam(
mean_pot_count ~ s(distance, k = 5) +
s(mws, k = 5) +
s(sum_rain, k = 5),
data = dat,
family = tw()
)
summary(mod11.0)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22837 0.04099 -5.571 5.32e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.497 3.855 123.761 < 2e-16 ***
## s(mws) 2.938 3.054 13.497 1.39e-05 ***
## s(sum_rain) 1.901 1.928 5.573 0.0132 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.673 Deviance explained = 61.2%
## -REML = 309.77 Scale est. = 0.36396 n = 334
print(p_gam(x = getViz(mod11.0)) +
ggtitle("s(Distance) + s(Windspeed) + s(Precipitation), family = tw()"),
pages = 1)
mod11.1 <-
gam(
mean_pot_count ~ s(distance, k = 5, bs = "ts") +
s(mws, k = 5, bs = "ts") +
s(sum_rain, k = 5, bs = "ts"),
data = dat,
family = tw()
)
summary(mod11.1)
##
## Family: Tweedie(p=1.044)
## Link function: log
##
## Formula:
## mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22588 0.04094 -5.518 7.02e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(distance) 3.250 4 117.921 < 2e-16 ***
## s(mws) 2.875 4 13.403 1.9e-12 ***
## s(sum_rain) 1.839 4 2.995 0.000496 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.671 Deviance explained = 61%
## -REML = 322.33 Scale est. = 0.36412 n = 334
print(
p_gam(x = getViz(mod11.1)) +
ggtitle(
"s(Distance, bs = 'ts') + s(Windspeed, bs = 'ts')\n+ s(Precipitation, bs = 'ts'), family = tw()"
),
pages = 1
)
This model, same structure as mod11.0, uses thin-plate splines to shrink the coefficients of the smooth to zero when possible.
models <- list(mod1 = mod1,
mod2 = mod2,
mod3 = mod3,
mod4 = mod4,
mod5 = mod5,
mod6 = mod6,
mod7 = mod7,
mod8 = mod8,
mod9 = mod9,
mod10 = mod10,
mod11.0 = mod11.0,
mod11.1 = mod11.1
)
map_df(models, glance, .id = "model") %>%
arrange(AIC)
## # A tibble: 12 x 7
## model df logLik AIC BIC deviance df.residual
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mod11.0 9.34 -320. 663. 708. 141. 325.
## 2 mod11.1 8.96 -322. 667. 712. 141. 325.
## 3 mod10 9.76 -392. 805. 846. 204. 324.
## 4 mod8 9.85 -392. 805. 847. 204. 324.
## 5 mod6 9.87 -392. 806. 847. 204. 324.
## 6 mod7 8.85 -394. 808. 845. 207. 325.
## 7 mod9 6.96 -412. 840. 870. 231. 327.
## 8 mod4 6.93 -414. 844. 874. 233. 327.
## 9 mod3 5.93 -416. 846. 873. 236. 328.
## 10 mod2 5.93 -423. 860. 886. 246. 328.
## 11 mod5 5.93 -423. 860. 886. 246. 328.
## 12 mod1 4.93 -424. 860. 883. 248. 329.
enframe(c(
mod1 = summary(mod1)$r.sq,
mod2 = summary(mod2)$r.sq,
mod3 = summary(mod3)$r.sq,
mod4 = summary(mod4)$r.sq,
mod5 = summary(mod5)$r.sq,
mod6 = summary(mod6)$r.sq,
mod7 = summary(mod7)$r.sq,
mod8 = summary(mod8)$r.sq,
mod9 = summary(mod9)$r.sq,
mod10 = summary(mod10)$r.sq,
mod11.0 = summary(mod11.0)$r.sq,
mod11.1 = summary(mod11.1)$r.sq
)) %>%
arrange(desc(value))
## # A tibble: 12 x 2
## name value
## <chr> <dbl>
## 1 mod11.0 0.673
## 2 mod11.1 0.671
## 3 mod10 0.566
## 4 mod6 0.566
## 5 mod8 0.566
## 6 mod7 0.562
## 7 mod9 0.515
## 8 mod4 0.509
## 9 mod3 0.504
## 10 mod2 0.484
## 11 mod5 0.484
## 12 mod1 0.482
anova(mod1,
mod2,
mod3,
mod4,
mod5,
mod6,
mod7,
mod8,
mod9,
mod10,
mod11.0,
mod11.1,
test = "F")
## Analysis of Deviance Table
##
## Model 1: mean_pot_count ~ s(distance, k = 5)
## Model 2: mean_pot_count ~ sum_rain + s(distance, k = 5)
## Model 3: mean_pot_count ~ mws + s(distance, k = 5)
## Model 4: mean_pot_count ~ sum_rain + mws + s(distance, k = 5)
## Model 5: mean_pot_count ~ sum_rain + s(distance + mws, k = 5)
## Model 6: mean_pot_count ~ sum_rain + s(distance, k = 5) + s(mws, k = 5)
## Model 7: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5)
## Model 8: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 9: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5)
## Model 10: mean_pot_count ~ s(distance, k = 5) + s(sum_rain, k = 5) + mws
## Model 11: mean_pot_count ~ s(distance, k = 5) + s(mws, k = 5) + s(sum_rain,
## k = 5)
## Model 12: mean_pot_count ~ s(distance, k = 5, bs = "ts") + s(mws, k = 5,
## bs = "ts") + s(sum_rain, k = 5, bs = "ts")
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 329.00 248.10
## 2 328.00 246.34 1.00004467 1.76 4.8398 0.028515 *
## 3 328.00 236.49 0.00032438 9.84 83384.0223 2.780e-11 ***
## 4 327.00 233.31 1.00009343 3.19 8.7512 0.003321 **
## 5 328.00 246.34 -1.00041782 -13.03 35.7856 5.797e-09 ***
## 6 324.01 204.37 3.99780494 41.97 28.8451 < 2.2e-16 ***
## 7 325.01 206.98 -1.00194276 -2.62 7.1733 0.007744 **
## 8 324.01 204.38 0.99830140 2.60 7.1576 0.007873 **
## 9 326.81 230.54 -2.79549912 -26.16 25.7123 3.516e-14 ***
## 10 324.04 204.44 2.76976222 26.11 25.8972 3.572e-14 ***
## 11 323.66 639.52 0.37229260 -435.08
## 12 323.68 644.19 -0.01578706 -4.66 811.7846 2.141e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.0_vis <- getViz(mod11.0)
check(mod11.0_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.0006677089,-7.453262e-09]
## (score 309.7676 & scale 0.3639647).
## Hessian positive definite, eigenvalue range [0.3939723,2978.454].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.00 3.50 0.86 0.010 **
## s(mws) 4.00 2.94 0.89 0.045 *
## s(sum_rain) 4.00 1.90 0.90 0.045 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod11.1_vis <- getViz(mod11.1)
check(mod11.1_vis,
a.qq = list(method = "tnorm",
a.cipoly = list(fill = "light blue")),
a.respoi = list(size = 0.5),
a.hist = list(bins = 10))
##
## Method: REML Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-6.815834e-09,5.749888e-08]
## (score 322.3273 & scale 0.3641249).
## Hessian positive definite, eigenvalue range [0.6649335,2979.663].
## Model rank = 13 / 13
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(distance) 4.00 3.25 0.86 0.010 **
## s(mws) 4.00 2.88 0.88 0.020 *
## s(sum_rain) 4.00 1.84 0.89 0.025 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model, mod11.0, mean_pot_count ~ s(Distance) + s(Windspeed) + s(Precipitation) - family = tw(), lacks enough numbers of basis functions for two of three predictors, distance, wsp, but it’s close. Realistically we need to increase the k values for these predictors, but we’re lacking the data to do this.